wells <- read.csv("ecol 563/wells.txt") wells # logistic regression model out1 <- glm(cbind(y, n-y)~ land.use + sewer, data=wells, family=binomial) summary(out1) # tabulate data to show quasi-complete separation xtabs(cbind(y,n-y)~land.use, data=wells) # refit model to show individual logit means for land use out1a <- glm(cbind(y, n-y)~ land.use + sewer-1, data=wells, family=binomial) summary(out1a) # models are the same, just different parameterizations AIC(out1,out1a) # create binary version of the file for comparison # function to create binary response yn.func <- function(x) rep(c(1,0),c(x[1],x[2]-x[1])) # sample calculation apply(wells[1:4,1:2], 1, yn.func) # assemble data wells.raw <- data.frame(yn=unlist(apply(wells[,1:2], 1, yn.func)), land.use=rep(wells$land.use,wells$n), sewer=rep(wells$sewer,wells$n)) dim(wells.raw) wells.raw[1:8,] # fit same model this time with a binary response out1b <- glm(yn~ land.use + sewer, data=wells.raw, family=binomial) # estimates and standard errors are the same, likelihood is different summary(out1b) summary(out1) coef(out1b) coef(out1) #### methods for dealing with quasi-complete separation #### # Method 1: Firth regression--uses penalized ML library(logistf) out1f <- logistf(yn~ land.use + sewer, data=wells.raw, family=binomial) out1f # Method 2: treat problem group separately: calculate Clopper-Pearson intervals # sample run binom.test(2,76) # CI for land use=undeveloped binom.test(0,76) # Method 3: collapse categories xtabs(y~land.use, data=wells) # examine proportion of contaminated wells by land use category xtabs(y~land.use, data=wells)/sum(xtabs(y~land.use, data=wells)) # combine undeveloped with agricultural wells$land.use2 <- factor(ifelse(wells$land.use %in% c('agri','undev'), 'rural', as.character(wells$land.use))) xtabs(y~land.use2, data=wells) # refit model with new categories out2 <- glm(cbind(y,n-y)~land.use2+sewer, data=wells, family=binomial) summary(out2) #### simplifying the model: reduce the number of land use categories # fit model to get separate logits for each land use type out2b <- glm(cbind(y,n-y)~land.use2+sewer-1, data=wells, family=binomial) # examine estimates summary(out2b) coef(out2b) # display estimates in a dotplot library(lattice) # estimates appear to fall into three clusters dotplot(coef(out2b)[1:9]) # combine five the categories wells$land.use3 <- factor(ifelse(wells$land.use2 %in% c('inst','recr','resL','resM','trans'), 'mixed', as.character(wells$land.use2))) out2c <- glm(cbind(y,n-y)~land.use3+sewer, data=wells, family=binomial) # compare with previous model using AIC AIC(out2b, out2c) # models are nested: use LR test anova(out2c, out2b, test='Chisq') # try collapsing an additional three land use categories wells$land.use4 <- factor(ifelse(wells$land.use3 %in% c('resH','comm','indus'), 'high.use', as.character(wells$land.use3))) out2d <- glm(cbind(y,n-y)~land.use4+sewer, data=wells, family=binomial) # compare all the land use models AIC(out2b, out2c, out2d) # LR test with previous model anova(out2d, out2c, test='Chisq') summary(out2d) # obtain estimates of individual land use logits out2e <- glm(cbind(y,n-y)~land.use4+sewer-1, data=wells, family=binomial) summary(out2e) # display confidence intervals confint(out2e) # predicted logits of individual observations predict(out2d) # predicted probabilities of individual observations predict(out2d, type='response') fitted(out2d) # obtain mean of each binomial distribution fitted(out2d)*wells$n wells$y # goodness of fit # collect the predicted successes and predicted failures in a matrix Ei <- cbind(fitted(out2d)*wells$n, wells$n-fitted(out2d)*wells$n) # too many predicted counts are small for chi-squared distribution to hold sum(Ei<=5)/length(Ei) # carry out Pearson test anyway Oi <- cbind(wells$y, wells$n-wells$y) sum((Oi-Ei)^2/Ei) # p-value for test 1-pchisq(sum((Oi-Ei)^2/Ei),nrow(wells)-length(coef(out2d))) # residual deviance can be used to check fit too # (except probably inappropriate here) summary(out2d) # p-value for residual deviance 1-pchisq(21.502,16)